Matrix plot of platform data (year, skill ratings, etc.) and demographic data (age, gender, etc.) with correlation coefficients.
add.covars <- function(x, logs = FALSE) {
covars <- with(races, data.frame(year = 2015 - year, rating,
nreg, nsub, algo_rating = algo_rating/100, algo_nreg,
algo_nsub, paid, nwins = nwins > 0, ntop5 = ntop5 > 0,
ntop10 = ntop10 > 0, risk, hours, timezone.dist = abs(timezone +
5), male, postgrad = ifelse(educ == "Postgraduate (MA)" |
educ == "Phd", 1, 0), below30 = ifelse(age == "<20" |
age == "20-25" | age == "26-30", 1, 0)))
controls <- aggregate(covars, by = with(races, list(room_id = room_id)),
mean, na.rm = TRUE)
out <- merge(x, controls)
out$lpaid <- log(out$paid) # Use logs for paid
out$paid <- NULL
if (logs) {
out$lpaid <- log(out$paid)
out$lhours <- log(out$hours)
out$lnreg <- log(out$nreg)
}
out$nwins <- out$nwins > 0
return(out)
}
races$n <- ave(races$submit, races$room_id, FUN = length)
entry <- add.covars(aggregate(submit ~ n + room_id + treatment +
room_size, data = races, sum))
summary(entry)## room_id n treatment room_size submit
## race 1 : 1 Min. : 9.00 race :8 Large:12 Min. :1.000
## race 2 : 1 1st Qu.:10.00 tournament:8 Small:12 1st Qu.:2.750
## race 3 : 1 Median :12.50 reserve :8 Median :4.000
## race 4 : 1 Mean :12.46 Mean :3.583
## race 5 : 1 3rd Qu.:15.00 3rd Qu.:5.000
## race 6 : 1 Max. :15.00 Max. :6.000
## (Other):18
## year rating nreg nsub
## Min. :3.000 Min. :11.37 Min. : 3.70 Min. : 1.300
## 1st Qu.:4.350 1st Qu.:12.43 1st Qu.:12.00 1st Qu.: 4.675
## Median :4.950 Median :13.17 Median :16.97 Median : 7.350
## Mean :5.037 Mean :13.37 Mean :17.21 Mean : 7.036
## 3rd Qu.:5.633 3rd Qu.:14.37 3rd Qu.:22.65 3rd Qu.: 9.033
## Max. :7.200 Max. :15.93 Max. :28.53 Max. :12.800
##
## algo_rating algo_nreg algo_nsub nwins
## Min. : 7.045 Min. :16.00 Min. :14.10 Mode :logical
## 1st Qu.: 9.296 1st Qu.:36.38 1st Qu.:31.82 FALSE:9
## Median :10.780 Median :45.45 Median :39.75 TRUE :15
## Mean :10.547 Mean :46.16 Mean :40.96 NA's :0
## 3rd Qu.:11.544 3rd Qu.:59.33 3rd Qu.:54.09
## Max. :12.991 Max. :73.00 Max. :65.70
##
## ntop5 ntop10 risk hours
## Min. :0.0000 Min. :0.0000 Min. :5.357 Min. :18.29
## 1st Qu.:0.1833 1st Qu.:0.2000 1st Qu.:6.178 1st Qu.:26.05
## Median :0.2667 Median :0.3000 Median :6.333 Median :30.07
## Mean :0.2222 Mean :0.3046 Mean :6.445 Mean :31.47
## 3rd Qu.:0.3083 3rd Qu.:0.4000 3rd Qu.:6.851 3rd Qu.:37.78
## Max. :0.4000 Max. :0.5333 Max. :7.625 Max. :45.13
##
## timezone.dist male postgrad below30
## Min. : 5.125 Min. :0.7500 Min. :0.1429 Min. :0.4000
## 1st Qu.: 6.475 1st Qu.:0.9000 1st Qu.:0.2500 1st Qu.:0.5714
## Median : 7.537 Median :1.0000 Median :0.4000 Median :0.6333
## Mean : 7.653 Mean :0.9482 Mean :0.3607 Mean :0.6787
## 3rd Qu.: 8.710 3rd Qu.:1.0000 3rd Qu.:0.4381 3rd Qu.:0.8115
## Max. :10.556 Max. :1.0000 Max. :0.5556 Max. :0.9000
##
## lpaid
## Min. : 8.052
## 1st Qu.: 9.143
## Median :10.009
## Mean :10.134
## 3rd Qu.:11.090
## Max. :12.293
##
entry.lm <- glm(log(submit/n) ~ treatment + room_size, data = entry)
summary(entry.lm)##
## Call:
## glm(formula = log(submit/n) ~ treatment + room_size, data = entry)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.83222 -0.24266 -0.01059 0.26639 0.80156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.37050 0.16875 -8.122 9.22e-08 ***
## treatmenttournament 0.30255 0.20667 1.464 0.159
## treatmentreserve 0.02434 0.20667 0.118 0.907
## room_sizeSmall -0.12421 0.16875 -0.736 0.470
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.170851)
##
## Null deviance: 3.9617 on 23 degrees of freedom
## Residual deviance: 3.4170 on 20 degrees of freedom
## AIC: 31.326
##
## Number of Fisher Scoring iterations: 2
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(entry.lm)entry.lm.all <- glm(log(submit/n) ~ ., data = entry[, -1])
summary(entry.lm.all)##
## Call:
## glm(formula = log(submit/n) ~ ., data = entry[, -1])
##
## Deviance Residuals:
## 1 2 3 4 5 6
## 0.045293 -0.029772 -0.004308 0.056464 0.052742 -0.017453
## 7 8 9 10 11 12
## -0.089327 -0.013639 -0.035707 0.022246 -0.039607 -0.027303
## 13 14 15 16 17 18
## 0.051768 0.057219 0.007678 -0.036293 -0.014274 -0.000146
## 19 20 21 22 23 24
## 0.050537 -0.023424 0.056149 -0.007597 -0.017957 -0.043289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.72554 4.72104 -1.848 0.1617
## treatmenttournament 0.58895 0.15719 3.747 0.0332 *
## treatmentreserve 0.11446 0.16704 0.685 0.5424
## room_sizeSmall -0.20902 0.15746 -1.327 0.2764
## year 0.63942 0.23953 2.670 0.0757 .
## rating 0.33075 0.09302 3.556 0.0379 *
## nreg -0.14251 0.07454 -1.912 0.1518
## nsub 0.31100 0.18484 1.683 0.1911
## algo_rating -0.12715 0.04227 -3.008 0.0573 .
## algo_nreg 0.10868 0.02399 4.530 0.0201 *
## algo_nsub -0.11131 0.02734 -4.072 0.0267 *
## nwinsTRUE -0.64568 0.19798 -3.261 0.0471 *
## ntop5 -1.25284 1.06177 -1.180 0.3231
## ntop10 0.97479 0.90602 1.076 0.3608
## risk -0.85303 0.16385 -5.206 0.0138 *
## hours -0.01399 0.01564 -0.895 0.4369
## timezone.dist -0.05276 0.04517 -1.168 0.3272
## male 4.61821 2.34450 1.970 0.1435
## postgrad -2.66143 1.23093 -2.162 0.1193
## below30 4.61756 1.74298 2.649 0.0770 .
## lpaid 0.09389 0.12870 0.730 0.5185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.01250787)
##
## Null deviance: 3.961652 on 23 degrees of freedom
## Residual deviance: 0.037524 on 3 degrees of freedom
## AIC: -42.951
##
## Number of Fisher Scoring iterations: 2
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(entry.lm.all)entry.lm.some <- update(entry.lm, "~ . + rating+nreg+nwins+hours+below30+risk+year+timezone.dist")
summary(entry.lm.some)##
## Call:
## glm(formula = log(submit/n) ~ treatment + room_size + rating +
## nreg + nwins + hours + below30 + risk + year + timezone.dist,
## data = entry)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.36135 -0.13818 0.00601 0.13666 0.36092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.629433 1.683283 -2.156 0.05207 .
## treatmenttournament 0.451951 0.145178 3.113 0.00897 **
## treatmentreserve 0.254329 0.219926 1.156 0.27001
## room_sizeSmall -0.124668 0.136684 -0.912 0.37968
## rating 0.136975 0.054946 2.493 0.02828 *
## nreg 0.015475 0.020545 0.753 0.46584
## nwinsTRUE -0.566535 0.182752 -3.100 0.00919 **
## hours 0.011408 0.009418 1.211 0.24909
## below30 2.081632 0.701793 2.966 0.01178 *
## risk -0.467709 0.118771 -3.938 0.00197 **
## year 0.184912 0.136793 1.352 0.20138
## timezone.dist 0.091483 0.043232 2.116 0.05592 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07534486)
##
## Null deviance: 3.96165 on 23 degrees of freedom
## Residual deviance: 0.90414 on 12 degrees of freedom
## AIC: 15.417
##
## Number of Fisher Scoring iterations: 2
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(entry.lm.some)“If we do go ahead and use the same data twice, once to pick a model and once to test hypotheses about that model, we will get confidence intervals which are systematically too narrow, p-values which are systematically too small, etc.” – Shalizi (2015)
i <- sample(1:nrow(entry), size = nrow(entry)/2)
j <- c("submit", "n", "treatment", "room_size", "year", "rating",
"nreg", "nwins", "risk", "hours", "below30")
entry.lm.step.half <- step(update(entry.lm.all, data = entry[,
j], subset = i))## Start: AIC=-3.54
## log(submit/n) ~ treatment + room_size + year + rating + nreg +
## nwins + risk + hours + below30
##
## Df Deviance AIC
## - nwins 1 0.070820 -5.5356
## - hours 1 0.071134 -5.4827
## - room_size 1 0.071315 -5.4521
## <none> 0.070789 -3.5410
## - rating 1 0.102555 -1.0927
## - below30 1 0.106360 -0.6555
## - nreg 1 0.196700 6.7227
## - year 1 0.197366 6.7633
## - treatment 2 0.289514 9.3610
## - risk 1 0.310764 12.2110
##
## Step: AIC=-5.54
## log(submit/n) ~ treatment + room_size + year + rating + nreg +
## risk + hours + below30
##
## Df Deviance AIC
## - hours 1 0.07113 -7.4825
## - room_size 1 0.07167 -7.3930
## <none> 0.07082 -5.5356
## - rating 1 0.10365 -2.9651
## - below30 1 0.12787 -0.4451
## - year 1 0.21538 5.8116
## - nreg 1 0.36103 12.0101
## - treatment 2 0.44172 12.4308
## - risk 1 0.60518 18.2089
##
## Step: AIC=-7.48
## log(submit/n) ~ treatment + room_size + year + rating + nreg +
## risk + below30
##
## Df Deviance AIC
## - room_size 1 0.07167 -9.3924
## <none> 0.07113 -7.4825
## - rating 1 0.10744 -4.5338
## - below30 1 0.16161 0.3652
## - year 1 0.23773 4.9962
## - nreg 1 0.36497 10.1404
## - treatment 2 0.49325 11.7548
## - risk 1 0.62918 16.6755
##
## Step: AIC=-9.39
## log(submit/n) ~ treatment + year + rating + nreg + risk + below30
##
## Df Deviance AIC
## <none> 0.07167 -9.3924
## - rating 1 0.11893 -5.3151
## - below30 1 0.19228 0.4499
## - year 1 0.37856 8.5790
## - nreg 1 0.56673 13.4212
## - treatment 2 0.97972 17.9898
## - risk 1 1.03529 20.6518
summary(entry.lm.step.half)##
## Call:
## glm(formula = log(submit/n) ~ treatment + year + rating + nreg +
## risk + below30, data = entry[, j], subset = i)
##
## Deviance Residuals:
## 9 5 13 23 6 24
## -0.083924 -0.108213 0.041056 -0.020625 0.083978 0.124433
## 8 22 2 7 4 11
## -0.084223 -0.103808 0.028811 0.087581 -0.007934 0.042868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.27446 1.40170 3.049 0.03805 *
## treatmenttournament 0.55359 0.14011 3.951 0.01680 *
## treatmentreserve 1.69520 0.25943 6.534 0.00283 **
## year 0.40254 0.09727 4.139 0.01439 *
## rating 0.10046 0.06186 1.624 0.17969
## nreg -0.08935 0.01700 -5.256 0.00627 **
## risk -1.03669 0.14136 -7.333 0.00184 **
## below30 -1.98306 0.76435 -2.594 0.06040 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.01791772)
##
## Null deviance: 1.878654 on 11 degrees of freedom
## Residual deviance: 0.071671 on 4 degrees of freedom
## AIC: -9.3924
##
## Number of Fisher Scoring iterations: 2
model <- formula(entry.lm.step.half)
entry.lm <- lm(model, data = entry, subset = setdiff(1:nrow(entry),
i))
summary(entry.lm)##
## Call:
## lm(formula = model, data = entry, subset = setdiff(1:nrow(entry),
## i))
##
## Residuals:
## 1 3 10 12 14 15 16 17
## 0.01272 -0.01272 0.25782 -0.09511 0.34791 -0.46651 -0.04411 -0.10343
## 18 19 20 21
## 0.09757 0.17755 -0.33359 0.16191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.45710 3.30825 -0.440 0.682
## treatmenttournament -0.07487 0.39531 -0.189 0.859
## treatmentreserve -0.42106 0.43889 -0.959 0.392
## year -0.03736 0.28372 -0.132 0.902
## rating 0.07786 0.09276 0.839 0.448
## nreg 0.03515 0.03560 0.987 0.379
## risk -0.35098 0.20901 -1.679 0.168
## below30 1.79145 1.52905 1.172 0.306
##
## Residual standard error: 0.3891 on 4 degrees of freedom
## Multiple R-squared: 0.582, Adjusted R-squared: -0.1495
## F-statistic: 0.7956 on 7 and 4 DF, p-value: 0.6296
require(MASS)
entry.rlm <- rlm(log(submit/n) ~ ., data = entry[, -1], maxit = 50)
summary(entry.rlm)##
## Call: rlm(formula = log(submit/n) ~ ., data = entry[, -1], maxit = 50)
## Residuals:
## 1 2 3 4 5 6
## 2.018e-05 -1.085e-05 -5.233e-06 2.170e-05 3.835e-02 -1.765e-05
## 7 8 9 10 11 12
## -3.617e-01 -8.155e-06 -1.857e-05 -1.938e-06 -6.403e-06 -9.972e-06
## 13 14 15 16 17 18
## 2.456e-05 1.616e-05 3.041e-06 -6.878e-06 -4.001e-06 -5.766e-06
## 19 20 21 22 23 24
## 2.236e-05 -1.513e-06 5.671e-02 8.271e-07 -4.571e-05 -2.038e-05
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -12.8115 0.0032 -3965.2505
## treatmenttournament 0.6989 0.0001 6496.3384
## treatmentreserve 0.1529 0.0001 1337.6861
## room_sizeSmall -0.3272 0.0001 -3035.9294
## year 0.8194 0.0002 4998.7459
## rating 0.4020 0.0001 6314.0938
## nreg -0.1933 0.0001 -3788.3626
## nsub 0.4586 0.0001 3625.4356
## algo_rating -0.0994 0.0000 -3434.8618
## algo_nreg 0.1429 0.0000 8701.8337
## algo_nsub -0.1502 0.0000 -8026.5545
## nwinsTRUE -0.8281 0.0001 -6112.0504
## ntop5 -1.3229 0.0007 -1820.6254
## ntop10 -0.1504 0.0006 -242.5310
## risk -0.9730 0.0001 -8677.0773
## hours -0.0184 0.0000 -1716.8155
## timezone.dist -0.0374 0.0000 -1208.6091
## male 6.0207 0.0016 3752.3557
## postgrad -3.0690 0.0008 -3643.1237
## below30 5.9835 0.0012 5016.1328
## lpaid 0.1885 0.0001 2139.9692
##
## Residual standard error: 2.615e-05 on 3 degrees of freedom
require(MASS)
entry.rlm <- rlm(log(submit/n) ~ treatment + room_size + rating +
nreg + nwins + hours + below30 + risk + year + timezone.dist,
data = entry[, -1], maxit = 50)
summary(entry.rlm)##
## Call: rlm(formula = log(submit/n) ~ treatment + room_size + rating +
## nreg + nwins + hours + below30 + risk + year + timezone.dist,
## data = entry[, -1], maxit = 50)
## Residuals:
## Min 1Q Median 3Q Max
## -0.411982 -0.111404 0.007527 0.136716 0.393944
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -3.7919 1.7145 -2.2117
## treatmenttournament 0.4592 0.1479 3.1054
## treatmentreserve 0.2790 0.2240 1.2456
## room_sizeSmall -0.1611 0.1392 -1.1568
## rating 0.1343 0.0560 2.4000
## nreg 0.0167 0.0209 0.7978
## nwinsTRUE -0.5956 0.1861 -3.1999
## hours 0.0126 0.0096 1.3127
## below30 2.2232 0.7148 3.1102
## risk -0.4800 0.1210 -3.9678
## year 0.2180 0.1393 1.5645
## timezone.dist 0.0894 0.0440 2.0295
##
## Residual standard error: 0.1828 on 12 degrees of freedom
# Principal component
pca <- prcomp(entry[, -c(1:5)], center = TRUE, scale = TRUE)
# summary(pca)
par(mfrow = c(1, 2))
plot(pca)
biplot(pca)# Predict
pca.hat <- predict(pca)
entry$pca1 <- pca.hat[, 1]
entry$pca2 <- pca.hat[, 2]
entry$pca3 <- pca.hat[, 3]
entry$pca4 <- pca.hat[, 4]
entry$pca5 <- pca.hat[, 5] # 74 % of variancem <- rep()
m$m1 <- update(entry.lm, "~.+pca1")
m$m2 <- update(entry.lm, "~.+pca1+pca2")
m$m3 <- update(entry.lm, "~.+pca1+pca2+pca3")
m$m4 <- update(entry.lm, "~.+pca1+pca2+pca3+pca4")
m$m5 <- update(entry.lm, "~.+pca1+pca2+pca3+pca4+pca5")
stargazer(entry.lm, m, type = "text", digits = 2)##
## ====================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------------
## log(submit/n)
## (1) (2) (3) (4) (5) (6)
## ----------------------------------------------------------------------------------------------------
## treatmenttournament -0.07 -0.12 -0.08 -0.08 0.66 0.66
## (0.40) (0.49) (0.62) (0.87)
##
## treatmentreserve -0.42 -0.53 -0.38 -0.47 -7.25 -7.25
## (0.44) (0.69) (1.08) (1.82)
##
## year -0.04 -0.13 0.03 0.03 0.36 0.36
## (0.28) (0.52) (0.95) (1.33)
##
## rating 0.08 0.06 0.12 0.17 0.85 0.85
## (0.09) (0.15) (0.35) (0.73)
##
## nreg 0.04 0.04 0.03 0.02 1.37 1.37
## (0.04) (0.04) (0.07) (0.13)
##
## risk -0.35 -0.35 -0.53 -0.53 -5.76 -5.76
## (0.21) (0.24) (0.89) (1.25)
##
## below30 1.79 1.95 1.54 1.71 -1.12 -1.12
## (1.53) (1.89) (2.92) (4.51)
##
## pca1 0.06 -0.01 0.06 -2.35 -2.35
## (0.26) (0.45) (1.01)
##
## pca2 0.09 0.13 2.67 2.67
## (0.38) (0.74)
##
## pca3 0.09 -1.02 -1.02
## (0.95)
##
## pca4 -5.37 -5.37
##
##
## pca5
##
##
## Constant -1.46 -0.81 -0.97 -1.62 1.07 1.07
## (3.31) (4.80) (5.86) (10.79)
##
## ----------------------------------------------------------------------------------------------------
## Observations 12 12 12 12 12 12
## R2 0.58 0.59 0.60 0.60 1.00 1.00
## Adjusted R2 -0.15 -0.51 -1.21 -3.38
## Residual Std. Error 0.39 (df = 4) 0.45 (df = 3) 0.54 (df = 2) 0.76 (df = 1)
## F Statistic 0.80 (df = 7; 4) 0.54 (df = 8; 3) 0.33 (df = 9; 2) 0.15 (df = 10; 1)
## ====================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
We start with differences in the empirical proportions.
entry <- xtabs(submit ~ treatment + room_size, data = aggregate(submit ~
treatment + room_size, races, mean))
knitr::kable(round(100 * entry, 1), caption = "Participation rates by competition style and room size")| Large | Small | |
|---|---|---|
| race | 26.7 | 25.6 |
| tournament | 30.0 | 37.5 |
| reserve | 30.0 | 22.5 |
We first study the impact of competition style on entry.
We regress all variables against treatment and room size dummies.
rating.lm <- lm(rating ~ submit * treatment + room_size, data = races)
rating.sum <- summary(rating.lm)
extract_info <- function(object) {
est <- round(object$coef[, 1], 2)
se <- paste("(", round(object$coef[, 2], 2), ")", sep = "")
vec <- as.vector(rbind(est, se))
vec.info <- rep(c("Estimates", "St.err"), length(est))
vec.names <- rep(names(est), each = 2)
data.frame(names = vec.names, info = vec.info, values = vec)
}
merge_info <- function(x, y) merge(x, y, by = c("names", "info"),
all = TRUE)
regmat <- function(x) {
models_l <- lapply(x, function(x) extract_info(summary(x)))
Reduce(merge_info, models_l)
}
summary(lm(rating ~ submit * treatment + room_size, data = races))##
## Call:
## lm(formula = rating ~ submit * treatment + room_size, data = races)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9261 -2.8764 -0.4474 2.2320 15.4873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.3809 0.6464 19.155 <2e-16 ***
## submitTRUE 2.1252 1.0574 2.010 0.0458 *
## treatmenttournament -0.4536 0.8666 -0.523 0.6013
## treatmentreserve 0.5097 0.8775 0.581 0.5620
## room_sizeSmall 0.3066 0.5865 0.523 0.6017
## submitTRUE:treatmenttournament 1.0649 1.4636 0.728 0.4677
## submitTRUE:treatmentreserve -1.6246 1.4876 -1.092 0.2761
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.155 on 203 degrees of freedom
## (89 observations deleted due to missingness)
## Multiple R-squared: 0.06715, Adjusted R-squared: 0.03957
## F-statistic: 2.435 on 6 and 203 DF, p-value: 0.02699
coefplot <- function(object, ...) {
object.sum <- summary(object)
est <- object.sum$coef[-1, 1]
se <- object.sum$coef[-1, 2]
xlim <- range(c(est + 2 * se, est - 2 * se))
plot(y = 1:length(est), x = est, xlim = xlim, ...)
segments(x0 = est + se, x1 = est - se, y0 = 1:length(est),
lwd = 2)
segments(x0 = est + 2 * se, x1 = est - 2 * se, y0 = 1:length(est))
}
rating.lm <- lm(rating ~ submit * I(treatment == "race") + room_size,
data = races)
vars <- c("year", "rating", "nreg", "nsub", "nwins", "ntop10",
"hours", "timezone")
par(mfrow = c(3, 3))
models <- lapply(vars, function(x) {
coefplot(update(rating.lm, paste(x, "~.")), pch = 16, bty = "n",
yaxt = "n")
abline(v = 0, lty = 3)
title(x)
})
m <- regmat(models)## Error in object$coef: $ operator is invalid for atomic vectors
mt <- t(m)
rownames(mt) <- c("names", "info", vars)## Error in `rownames<-`(`*tmp*`, value = c("names", "info", "year", "rating", : length of 'dimnames' [1] not equal to array extent
mt[-c(1, 2), ]## m1 m2 m3 m4 m5
We assume the conditional mean of the proportion of entrants \(Y_{ij}=E_{ij}/N_{j}\) observed in a room randomly assigned to the \(i\)th treatment and of room size \(N_j\) is \[ E[Y_{ij}=y \mid x] = \text{competition}_{i} + \text{size}_{j} \]
The model is fully specified with \(var(Y\mid x)\) assumed normal and costant.
races$n <- ave(races$submit, races$room_id, FUN = length)
rooms <- aggregate(submit ~ room + room_size + n + treatment,
data = races, sum)
rooms$y <- rooms$submit/rooms$n
rooms.lm <- lm(y ~ treatment + room_size, data = rooms)
# rooms.lm <- lm(log(y) ~ treatment+room_size, data=rooms)
summary(rooms.lm)##
## Call:
## lm(formula = y ~ treatment + room_size, data = rooms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.160648 -0.063310 -0.006019 0.046759 0.240741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.262963 0.044849 5.863 9.79e-06 ***
## treatmenttournament 0.076389 0.054928 1.391 0.180
## treatmentreserve 0.001389 0.054928 0.025 0.980
## room_sizeSmall -0.003704 0.044849 -0.083 0.935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1099 on 20 degrees of freedom
## Multiple R-squared: 0.1127, Adjusted R-squared: -0.02043
## F-statistic: 0.8465 on 3 and 20 DF, p-value: 0.4846
This model shows that the difference is significant (one-sided) but the model is not significant F-test. We try with the logarithm.
rooms.lm.log <- lm(log(y) ~ treatment + room_size, data = rooms)
summary(rooms.lm.log)##
## Call:
## lm(formula = log(y) ~ treatment + room_size, data = rooms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83222 -0.24266 -0.01059 0.26639 0.80156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.37050 0.16875 -8.122 9.22e-08 ***
## treatmenttournament 0.30255 0.20667 1.464 0.159
## treatmentreserve 0.02434 0.20667 0.118 0.907
## room_sizeSmall -0.12421 0.16875 -0.736 0.470
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4133 on 20 degrees of freedom
## Multiple R-squared: 0.1375, Adjusted R-squared: 0.008097
## F-statistic: 1.063 on 3 and 20 DF, p-value: 0.3871
Robust linear regression model.
library(MASS)
rooms.rlm <- rlm(y ~ treatment + room_size, data = rooms)
mat <- coef(summary(rooms.rlm))
round(cbind(mat, pval = 2 * (1 - pt(abs(mat[, 3]), df = 20))),
3)## Value Std. Error t value pval
## (Intercept) 0.256 0.044 5.790 0.00
## treatmenttournament 0.072 0.054 1.327 0.20
## treatmentreserve 0.023 0.054 0.418 0.68
## room_sizeSmall -0.025 0.044 -0.563 0.58
Model checking. Residuals do not look normal.
res <- resid(rooms.lm) ## Raw residuals
lev <- hatvalues(rooms.lm) ## Leverages
res.mo <- res/sqrt(1 - lev) ## Modified residuals
# Diagnostic plots
par(mfrow = c(1, 2))
qqnorm(res.mo, pch = 16, xlab = "Quantiles Standard Normal",
ylab = "Modified residuals", main = "")
qqline(res.mo, pch = 16)
plot(lev, res.mo, pch = 16, xlab = "Leverage h", ylab = "Modified residuals")After resampling errors, we find only small differences between bootstrapped and asymptotic estimates of coefficients’ standard errors.
rooms$res <- resid(rooms.lm)
rooms$fitted <- predict(rooms.lm)
f <- function(data, i, object) {
d <- data
d$y <- d$fitted + d$res[i]
lm.res <- update(object, data = d)
c(coef(lm.res), sigma(lm.res))
}
rooms.boot <- boot(f, R = 499, data = rooms, object = rooms.lm)
plot(rooms.boot, index = 2)SE <- apply(rooms.boot$t, 2, sd) # St. errors compared to 0.096 and 0.0285
round(cbind(boot = SE, approx = c(coef(summary(rooms.lm))[, 2],
err = sigma(rooms.lm))), 3)## boot approx
## (Intercept) 0.039 0.045
## treatmenttournament 0.051 0.055
## treatmentreserve 0.051 0.055
## room_sizeSmall 0.041 0.045
## err 0.015 0.110
We check after resampling cases
f <- function(data, i, object) {
d <- data[i, ]
lm.res <- update(object, data = d)
c(coef(lm.res), sigma(lm.res))
}
rooms.boot <- boot(f, R = 499, data = rooms, object = rooms.lm)
plot(rooms.boot, index = 2)SE <- apply(rooms.boot$t, 2, sd) # St. errors compared to 0.096 and 0.0285
round(cbind(boot = SE, approx = c(coef(summary(rooms.lm))[, 2],
err = sigma(rooms.lm))), 3)## boot approx
## (Intercept) 0.038 0.045
## treatmenttournament 0.056 0.055
## treatmentreserve 0.051 0.055
## room_sizeSmall 0.044 0.045
## err 0.017 0.110
Another simple model is that the number of entrants \(e_{i}\) for the ith competition style is a binomial random variable with probability \(\pi_{i}\) and size \(n_{i}\). The responses are taken \(y_i = e_i / n_i\) so the mean response is equal to the probability that a competitor enters.
rooms.glm <- glm(cbind(submit, n) ~ treatment + room_size, binomial(logit),
data = rooms)
summary(rooms.glm)##
## Call:
## glm(formula = cbind(submit, n) ~ treatment + room_size, family = binomial(logit),
## data = rooms)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.07182 -0.39862 -0.01323 0.22723 1.14351
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.33075 0.24103 -5.521 3.37e-08 ***
## treatmenttournament 0.22871 0.29815 0.767 0.443
## treatmentreserve 0.02759 0.30920 0.089 0.929
## room_sizeSmall -0.01605 0.25052 -0.064 0.949
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.9613 on 23 degrees of freedom
## Residual deviance: 7.2401 on 20 degrees of freedom
## AIC: 83.102
##
## Number of Fisher Scoring iterations: 4
For an adequate fit, the deviance should be approximately distributed as Chisq with 23 degrees of freedom. A chisq test gives a pvalue of 0.9958201, indicating under-dispersion relative to the model. This can be due to heterogeneity of probability of entering which is not well accounted for in the aggregated model.
A quasi-binomial model seems to fit better; confirms the under-dispersion; and now the races vs tournament effect is almost significant (one-sided, 10 percent). Again, however, deviance appears very low relative to the theoretical model.
rooms.qb <- glm(cbind(submit, n) ~ treatment + room_size, quasibinomial(logit),
data = rooms)
summary(rooms.qb) # races vs tournament is slightly nonsignificant (one-sided)##
## Call:
## glm(formula = cbind(submit, n) ~ treatment + room_size, family = quasibinomial(logit),
## data = rooms)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.07182 -0.39862 -0.01323 0.22723 1.14351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.33075 0.14343 -9.278 1.1e-08 ***
## treatmenttournament 0.22871 0.17742 1.289 0.212
## treatmentreserve 0.02759 0.18399 0.150 0.882
## room_sizeSmall -0.01605 0.14907 -0.108 0.915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.3540818)
##
## Null deviance: 7.9613 on 23 degrees of freedom
## Residual deviance: 7.2401 on 20 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Since the binomial model does not fit well the data, we try modeling entrants in a room as independent poisson variables with mean \(\mu_{ij}\) for the i-th competition style and k-th room size. That is: \[ \mu_{ik} = exp(\alpha_i + \beta_k) \] where \(\alpha_i\) is the competition style and \(\beta\) the roome size. The poisson model might be better than binomial because it allows individual probabilities of entry not be identical. [Need to very it]
The poisson model gives very similar results to the binomial model, including under-dispersion.
rooms.poi <- glm(submit ~ offset(log(n)) + treatment + room_size,
quasipoisson, data = rooms)
summary(rooms.poi)##
## Call:
## glm(formula = submit ~ offset(log(n)) + treatment + room_size,
## family = quasipoisson, data = rooms)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.17879 -0.45198 -0.01118 0.25473 1.31354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.33240 0.14622 -9.112 1.47e-08 ***
## treatmenttournament 0.22843 0.17896 1.276 0.216
## treatmentreserve 0.02776 0.18752 0.148 0.884
## room_sizeSmall -0.01178 0.15052 -0.078 0.938
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 0.4657543)
##
## Null deviance: 10.1993 on 23 degrees of freedom
## Residual deviance: 9.2688 on 20 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Another way to model under-dispersed data is with the beta-binomial model (following Davison’s book ch. 10). In this case, the probability of entry \(\pi\) has the beta distribution with parameters \(a\) and \(b\). Hence.
The model is \[ E[y] = m \mu,\quad Var(R) = m \mu (1-\mu) + m(m-1)\sigma^2 \] where \(\mu\) and \(\sigma^2\) denote mean and variance of \(\pi\) (according to beta-binomial these are \(a / (a+b)\) and ab/{(a+b)(a+b+1)}).
A binomial model with individual data should give very similar results. Let’s check this intuition
# With individual data
races.glm <- glm(submit ~ treatment + room_size, quasibinomial(logit),
data = races)
summary(races.glm)##
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(logit),
## data = races)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8974 -0.7956 -0.7828 1.4861 1.6398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.02584 0.25207 -4.070 6.05e-05 ***
## treatmenttournament 0.32428 0.31418 1.032 0.303
## treatmentreserve 0.03784 0.32294 0.117 0.907
## room_sizeSmall -0.01660 0.26354 -0.063 0.950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.013501)
##
## Null deviance: 358.81 on 298 degrees of freedom
## Residual deviance: 357.49 on 295 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
There’s over-dispersion relative to the model, as a chisq test gives a pvalue of 1 - pchisq(races.glm$deviance, races.glm$df.residual). So we fit quasi-binomial model to account for overdispersion.
races.qb <- glm(submit ~ treatment+room_size, quasibinomial(logit), data=races)
summary(races.qb)
The estimated effect of races vs tournament on the probability of entry is larger but nonsignificant.
We check different link functions like probit and clolog that give very similar results.
summary(races.probit <- glm(submit ~ treatment + room_size, quasibinomial(probit),
data = races))##
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(probit),
## data = races)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8979 -0.7960 -0.7832 1.4855 1.6407
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63062 0.15003 -4.203 3.49e-05 ***
## treatmenttournament 0.19551 0.18904 1.034 0.302
## treatmentreserve 0.02244 0.19216 0.117 0.907
## room_sizeSmall -0.01185 0.15823 -0.075 0.940
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.013503)
##
## Null deviance: 358.81 on 298 degrees of freedom
## Residual deviance: 357.49 on 295 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
summary(races.cloglog <- glm(submit ~ treatment + room_size,
quasibinomial(cloglog), data = races))##
## Call:
## glm(formula = submit ~ treatment + room_size, family = quasibinomial(cloglog),
## data = races)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8966 -0.7949 -0.7821 1.4870 1.6383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.18478 0.21663 -5.469 9.66e-08 ***
## treatmenttournament 0.27338 0.26536 1.030 0.304
## treatmentreserve 0.03261 0.27772 0.117 0.907
## room_sizeSmall -0.00957 0.22307 -0.043 0.966
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.013514)
##
## Null deviance: 358.81 on 298 degrees of freedom
## Residual deviance: 357.49 on 295 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
It’s clear that the probability of entry is not constant among competitors but depends on ability. So we assume:
\[ \pi = \beta_{0i} + \beta_{1j} + \beta_{2} \text{skill rating}_i \]
Again we use quasibinomial to account for overdispersion.
races$rating.imp <- impute(races$rating, "zero")
races.glm <- glm(submit ~ treatment + room_size + log(nreg),
quasibinomial(logit), data = races)
summary(races.glm)##
## Call:
## glm(formula = submit ~ treatment + room_size + log(nreg), family = quasibinomial(logit),
## data = races)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3336 -0.8449 -0.6520 1.1478 2.1031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.09547 0.36753 -5.701 2.89e-08 ***
## treatmenttournament 0.32962 0.32620 1.010 0.313
## treatmentreserve -0.04902 0.33492 -0.146 0.884
## room_sizeSmall 0.06018 0.27408 0.220 0.826
## log(nreg) 0.47244 0.10678 4.424 1.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.012135)
##
## Null deviance: 358.81 on 298 degrees of freedom
## Residual deviance: 335.79 on 294 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Mmmhhh… perhaps I should look at number of submissions… instead of participation.
subm <- aggregate(nsub ~ n + room_id + room_size + treatment,
data = races, sum)
par(mfrow = c(1, 2))
boxplot(nsub/n ~ room_size, subm, horizontal = TRUE)
boxplot(nsub/n ~ treatment, subm, horizontal = TRUE)Testing differences between threshold (race & reserve) and tournament. Bootstrap
# OLS
subm.lm <- rlm(nsub/n ~ treatment + room_size, data = subm)
summary(subm.lm)##
## Call: rlm(formula = nsub/n ~ treatment + room_size, data = subm)
## Residuals:
## Min 1Q Median 3Q Max
## -5.01364 -1.59091 -0.07557 2.00992 6.86249
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 6.7980 1.2891 5.2736
## treatmenttournament 1.0823 1.5788 0.6855
## treatmentreserve 1.6155 1.5788 1.0233
## room_sizeSmall -1.6605 1.2891 -1.2881
##
## Residual standard error: 2.81 on 20 degrees of freedom
# OLS with log
subm.lm.log <- glm(log(nsub/n) ~ treatment + room_size, data = subm)
summary(subm.lm.log)##
## Call:
## glm(formula = log(nsub/n) ~ treatment + room_size, data = subm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.26396 -0.23919 0.06036 0.27676 0.95858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7585 0.2185 8.048 1.06e-07 ***
## treatmenttournament 0.2327 0.2676 0.870 0.395
## treatmentreserve 0.3517 0.2676 1.314 0.204
## room_sizeSmall -0.2322 0.2185 -1.063 0.301
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2864534)
##
## Null deviance: 6.5646 on 23 degrees of freedom
## Residual deviance: 5.7291 on 20 degrees of freedom
## AIC: 43.729
##
## Number of Fisher Scoring iterations: 2
# Poisson
subm.poi <- glm(nsub ~ log(offset(n)) + treatment + room_size,
poisson, data = subm)
summary(subm.poi)##
## Call:
## glm(formula = nsub ~ log(offset(n)) + treatment + room_size,
## family = poisson, data = subm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.7474 -2.3714 0.2507 2.5366 6.2816
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.659164 4.866811 -3.012 0.00259 **
## log(offset(n)) 7.141301 1.799365 3.969 7.22e-05 ***
## treatmenttournament 0.001948 0.055424 0.035 0.97197
## treatmentreserve 0.156722 0.053508 2.929 0.00340 **
## room_sizeSmall 2.369320 0.738567 3.208 0.00134 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 535.99 on 23 degrees of freedom
## Residual deviance: 332.97 on 19 degrees of freedom
## AIC: 491.26
##
## Number of Fisher Scoring iterations: 4
subm.qpoi <- glm(nsub ~ log(offset(n)) + treatment + room_size,
quasipoisson, data = subm)
summary(subm.qpoi)##
## Call:
## glm(formula = nsub ~ log(offset(n)) + treatment + room_size,
## family = quasipoisson, data = subm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.7474 -2.3714 0.2507 2.5366 6.2816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.659164 19.642887 -0.746 0.465
## log(offset(n)) 7.141301 7.262399 0.983 0.338
## treatmenttournament 0.001948 0.223698 0.009 0.993
## treatmentreserve 0.156722 0.215965 0.726 0.477
## room_sizeSmall 2.369320 2.980924 0.795 0.437
##
## (Dispersion parameter for quasipoisson family taken to be 16.29002)
##
## Null deviance: 535.99 on 23 degrees of freedom
## Residual deviance: 332.97 on 19 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
When we turn to the individual propensity to submit, we strongly reject differences in the propensity to submit between large and small rooms. No evidence of competition styles effects.
# Submit and room size
tab.size <- with(races, table(submit, room_size))
fisher.test(tab.size) # p = 0.98##
## Fisher's Exact Test for Count Data
##
## data: tab.size
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5689643 1.6912635
## sample estimates:
## odds ratio
## 0.9846648
# Submit and competition styles
tab <- with(races, table(submit, treatment))
fisher.test(tab) # p = 0.52##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.5255
## alternative hypothesis: two.sided
# Competition styles with target
tab <- with(races, table(submit, treatment == "tournament"))
fisher.test(tab, alternative = "greater") # p=0.15##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.1558
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.8435475 Inf
## sample estimates:
## odds ratio
## 1.355339
If we assume uniform priors, we can easily compute the posteriors in each treatment. (Not sure what we can do with this result.)
# Bayesian analysis Compute posteriors Beta(1 + success, 1 +
# n - success)
plot.posterior <- function(x, ...) {
tab <- with(races, table(submit, treatment))
param <- data.frame(tab + 1)
index <- param$treatment == x
shape <- param[index, ]
curve(dbeta(x, shape$Freq[2], shape$Freq[1]), ...)
}
# PLOTs
plot.posterior("race", from = 0.1, to = 0.5, xlab = "propensity to submit",
ylab = "posterior density")
plot.posterior("tournament", add = TRUE, lty = 2)
plot.posterior("reserve", add = TRUE, lty = 3)
title("Beta(y + alpha, n-y + beta)")
legend("topright", legend = levels(races$treatment), lty = 1:3,
bty = "n")Impute missing values at random for the survey. Ignore missing ratings (or impute them).
dat.imp <- within(dat, {
set.seed(25978)
rating.100 <- rating/100
rated <- !is.na(rating)
rating.100imp <- impute(rating/100, "zero")
hours.imp <- impute(hours, "random")
risk.imp <- impute(risk, "random")
grad.imp <- impute(grad, "random")
male.imp <- impute(male, "random")
below30.imp <- impute(below30, "random")
timezone.imp <- impute(timezone, "random")
expert <- cut(submissions, quantile(submissions[!is.na(rating)]),
include = TRUE)
})## Error in within(dat, {: object 'dat' not found
Fitting the model
# First model has no covariates (i.e., $\gamma=0$)
summary(m0 <- glm(submit ~ treatment, binomial(logit), data = dat.imp))## Error in is.data.frame(data): object 'dat.imp' not found
# The second model adds the main skill rating measure which
# is available for 2/3 of our population. It seemed better to
# rescale rating in 100-point units and center it on the
# median value. Thus, the estimate of the intercept can be
# easily transformed in the probability of participation of
# the median rated individual assigned to a room with a race
# competition style.
summary(m1 <- update(m0, " ~ . + rating.100"))## Error in update(m0, " ~ . + rating.100"): object 'm0' not found
summary(m1bis <- update(m0, " ~ . + rating.100imp + rated"))## Error in update(m0, " ~ . + rating.100imp + rated"): object 'm0' not found
# The third column adds time availability (hours)
summary(m2 <- update(m1bis, " ~ . + hours.imp"))## Error in update(m1bis, " ~ . + hours.imp"): object 'm1bis' not found
# ... add demographics to m2
summary(m3 <- update(m2, " ~ . + timezone.imp + grad.imp + below30.imp + male.imp"))## Error in update(m2, " ~ . + timezone.imp + grad.imp + below30.imp + male.imp"): object 'm2' not found
# ... add risk aversion
summary(m4 <- update(m2, " ~ . + risk.imp"))## Error in update(m2, " ~ . + risk.imp"): object 'm2' not found
# ... add everything stepwise regression
summary(m5 <- step(update(m3, " ~ . + risk.imp")))## Error in update(m3, " ~ . + risk.imp"): object 'm3' not found
# Compare models
models <- list(m0, m1, m1bis, m2, m3, m4, m5)## Error in eval(expr, envir, enclos): object 'm0' not found
stargazer(models, type = "text", digits = 2)## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors
Explore accuracy of predictions of the stepwise model
# Compare models in terms of prediction accuracy
accuracy <- function(fit) {
yhat <- predict(fit, type = "response")
tab <- table(predicted = ifelse(yhat > 0.5, 1, 0), actual = fit$y)
tp <- tab[2, 2]
fp <- tab[2, 1]
fn <- tab[1, 2]
precision <- tp/(tp + fp)
recall <- tp/(tp + fn)
fmeasure <- 2 * precision * recall/(precision + recall)
list(conf.table = tab, precision = precision, recall = recall,
f.measure = fmeasure)
}
accuracy(m5)## Error in predict(fit, type = "response"): object 'm5' not found
summarize.fit <- function(x) {
yhat <- predict(x)
with(x, plot(jitter(y) ~ yhat, col = ifelse(yhat > 0, "brown",
"blue"), pch = 16, xlab = "ability ~ Logistic"))
curve(ilogit, add = T)
}
par(mfrow = c(3, 3))
sapply(models, summarize.fit)## Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "NULL"
Study interaction effecs first with models on a subset and then using interaction terms in the regression.
# Using Akaike criterion to select the best model, we now
# compute the model for each treatment alone
summary(race <- update(m5, "~ . -treatment", subset = treatment ==
"race"))## Error in update(m5, "~ . -treatment", subset = treatment == "race"): object 'm5' not found
summary(tour <- update(m5, "~ . -treatment", subset = treatment ==
"tournament"))## Error in update(m5, "~ . -treatment", subset = treatment == "tournament"): object 'm5' not found
summary(rese <- update(m5, "~ . -treatment", subset = treatment ==
"reserve"))## Error in update(m5, "~ . -treatment", subset = treatment == "reserve"): object 'm5' not found
# Interact on stepwise model
summary(interact <- update(m5, "~ (.)*treatment"))## Error in update(m5, "~ (.)*treatment"): object 'm5' not found
summary(stepwise <- step(interact))## Error in terms(object): object 'interact' not found
# Interaction on full model
summary(interact.full <- update(m4, "~ (.)*treatment"))## Error in update(m4, "~ (.)*treatment"): object 'm4' not found
summary(stepwise.full <- step(interact.full))## Error in terms(object): object 'interact.full' not found
models <- list(m5, race, tour, rese, interact, stepwise, interact.full,
stepwise.full)## Error in eval(expr, envir, enclos): object 'm5' not found
stargazer(models, type = "text", digits = 3)## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors
par(mfrow = c(3, 3))
sapply(models, summarize.fit)## Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "NULL"
# This identification result holds under the assumption that
# the skill distribution is logistic. Next, we relax this
# distributional assumption. First, we check different
# distributions [e.g., probit, cauchit, cloglog]. We find
# Probit model seems slightly better in terms of deviance
probit <- update(m5, family = binomial(probit))## Error in update(m5, family = binomial(probit)): object 'm5' not found
cauchit <- update(m5, family = binomial(cauchit))## Error in update(m5, family = binomial(cauchit)): object 'm5' not found
cloglog <- update(m5, family = binomial(cloglog))## Error in update(m5, family = binomial(cloglog)): object 'm5' not found
fit <- list(logit = m5, probit = probit, cauchit = cauchit, cloglog = cloglog)## Error in eval(expr, envir, enclos): object 'm5' not found
cbind(deviance = sapply(fit, deviance), n = sapply(fit, function(x) length(x$y)))## Error in lapply(X = X, FUN = FUN, ...): object 'fit' not found
# ... but not necessarily in terms of prediction accuracy
accuracy(probit)## Error in predict(fit, type = "response"): object 'probit' not found
accuracy(m5)## Error in predict(fit, type = "response"): object 'm5' not found
We try generalized additive models
# GAMs Generalized additive models
m.gam <- mgcv::gam(submit ~ treatment + s(hours.imp) + s(rating.100imp),
data = dat.imp, family = binomial)## Error in is.data.frame(data): object 'dat.imp' not found
summary(m.gam)## Error in summary(m.gam): object 'm.gam' not found
plot(m.gam)## Error in plot(m.gam): object 'm.gam' not found
And non-parametric models like KS and Ichimura.
require(np)
# Ichimura
m.np <- npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp +
treatment, data = dat.imp)## Error in npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + treatment, : object 'dat.imp' not found
summary(m.np)## Error in summary(m.np): object 'm.np' not found
plot(m.np)## Error in plot(m.np): object 'm.np' not found
# Klein-spady ... Not sure about this implementation!
kleinspad <- npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp +
treatment, data = dat.imp, method = "kleinspady")## Error in npindexbw(as.numeric(submit) ~ rating.100imp + hours.imp + treatment, : object 'dat.imp' not found
summary(kleinspad)## Error in summary(kleinspad): object 'kleinspad' not found
plot(kleinspad)## Error in plot(kleinspad): object 'kleinspad' not found
# Bayesian models model <- 'submit ~ treatment + tothours.imp
# + mm_rating.100 + educ+gender+timezone' bayes <-
# arm::bayesglm(model, family=binomial) summary(bayes)The distribution of final scores shows a few values much smaller than the rest of data points. These outliers come from submissions that either failed to compile while testing, thus returning a null value, or returned extremely small values for some other reason. As submissions could not be much smaller than the baseline…
plot.scores.time <- function(...) {
scores <- scores[order(scores$submission), ] # Order scores by time
index <- tapply(1:nrow(scores), scores$coder_id, tail, 1) # last submission
scores <- scores[index, ]
scores <- scores[order(scores$timestamp), ] # order by time
scores$final[is.na(scores$final)] <- 0.792867 # Impute 2 missing scores
plot(final/1e+06 ~ timestamp, data = scores, type = "l",
...)
abline(h = 0.792867, lty = 3) # Baseline
abline(h = 0.817866, lty = 2, col = 2) # Target
}
plot.scores.time(col = "lightgray", xlab = "Time of submission (days)",
ylab = "Final score")Scores over time
We drop these outliers in regression, which is somewhat equivalent to using one-way winsorized group means in regression.
# https://en.wikipedia.org/wiki/Winsorized_mean cap.scores <-
# function(x, threshold) ifelse(x<threshold, threshold, x)
# cap.01 <- quantile(scores$final, na.rm=TRUE, p=0.1)
# scores$final.cap <- cap.scores(scores$final,
# threshold=cap.01) scores$final.cap2 <-
# cap.scores(scores$final, threshold=792867) # Baselinerequire(contest)
elastic <- c(-1, 2, -2)
n <- 10
# Study tournament
x.100 <- seq(0.01,0.99, length=100)
t4 <- contest(x=x.100, n=n, type='tournament', deadline=1, target=0.2, elasticity=elastic)
par(mfrow=c(3,3))
plot(t4, pch=16, cex=0.5)
# Create a large matrix of abilities
n <- 50
a <- matrix(runif(100*n), 100, n)
# Simulations
ystar <- apply(a, 1, function(x) max(contest(x, n=10, type="tournament", deadline=1, target=0.5)$score))
tstar <- apply(a, 1, function(x) max(contest(x, n=10, type="tournament", deadline=1, target=0.5)$timing))
mean(ystar) - 0.5 * mean(tstar)
# Optimize target in a tournament
tournament <- function(x, y) contest(x, n=length(x), target=y, type="tournament")$score
h <- apply(a, 1, function(x) max(tournament(x, 0.1)))
f <- function(x, data) {
ystar <- apply(data, 1, function(x)
max()
tstar <- apply(data, 1, function(x)
max(contest(x, n=ncol(data), type="tournament", target=x)$timing))
- mean(ystar) + mean(tstar)
}
optimize(f, interval=c(0, 1), n=10) # Optimal target 0.5716364
# Optimize number of competitors in a tournament
f <- function(x, R=99, elastic=c(-1, 2, -2)) {
n <- as.integer(x)
rp <- replicate(R, max(contest(x=runif(n), n=n, type='tournament', deadline=1, target=0.5)$score))
- mean(rp)
}
optimize(f, interval=c(2, 50)) # Optimal number of competitors
# Optimize with concerns for timing
f <- function(x, R=99, elastic=c(-1, 2, -2)) {
n <- as.integer(x)
rp <- replicate(R, max(contest(x=runif(n), n=n, type='tournament', deadline=1, target=0.5)$score))
- mean(rp) + mean()
}
optimize(f, interval=c(2, 50)) # Optimal number of competitors
data(scores)
# Create panel
create.panel <- function(z, id) {
diffmin <- function(x) x - min(x)
z$date <- z$timestamp %>% as.Date %>% as.numeric %>% diffmin
z$y <- 1
g <- expand.grid(date = seq(min(z$date), max(z$date)), coder_id = id,
y = 0)
g2 <- rbind(g, z[, c("date", "coder_id", "y")])
g2$date <- ifelse(g2$date < 3, 1, ifelse(g2$date < 5, 2,
ifelse(g2$date < 7, 3, 4)))
aggregate(y ~ date + coder_id, data = g2, FUN = sum)
}
scores.panel0 <- create.panel(scores, races$coder_id)
scores.panel <- merge(scores.panel0, races, by = "coder_id")
# Unconditional semi-parametric [Odds(first) = exp(alpha +
# beta * t )]
scores.panel$submit <- scores.panel$y > 0
clog <- clogit(submit ~ treatment + strata(date2), data = scores.panel)## Error in strata(date2): object 'date2' not found
summary(clog)## Error in summary(clog): object 'clog' not found
# Logistic
m.log <- glm(submit ~ treatment + factor(date), binomial(logit),
data = scores.panel)
summary(m.log)##
## Call:
## glm(formula = submit ~ treatment + factor(date), family = binomial(logit),
## data = scores.panel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6938 -0.6138 -0.5635 -0.5287 2.0297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.78616 0.19861 -8.994 <2e-16 ***
## treatmenttournament 0.13586 0.19251 0.706 0.480
## treatmentreserve 0.02642 0.19592 0.135 0.893
## factor(date)2 -0.13734 0.23461 -0.585 0.558
## factor(date)3 0.07646 0.22583 0.339 0.735
## factor(date)4 0.34873 0.21687 1.608 0.108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1057.1 on 1195 degrees of freedom
## Residual deviance: 1051.3 on 1190 degrees of freedom
## AIC: 1063.3
##
## Number of Fisher Scoring iterations: 4
## Add rating
clog2 <- clogit(submit ~ treatment + I(mm_rating/100) + strata(date),
data = scores.panel)## Error in unique(c("AsIs", oldClass(x))): object 'mm_rating' not found
summary(clog2)## Error in summary(clog2): object 'clog2' not found
# Add week hours
hours <- 0
for (i in 1:4) {
w <- with(scores.panel, switch(i, `1` = week1, `2` = week2,
`3` = week3, `4` = week4))
index <- scores.panel$date == i
hours[index] <- w[index]
}
clog3 <- clogit(submit ~ treatment + hours + strata(date), data = scores.panel)
summary(clog3)## Call:
## coxph(formula = Surv(rep(1, 1196L), submit) ~ treatment + hours +
## strata(date), data = scores.panel, method = "exact")
##
## n= 1108, number of events= 179
## (88 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treatmenttournament 0.062484 1.064477 0.202002 0.309 0.75708
## treatmentreserve 0.106070 1.111899 0.201006 0.528 0.59771
## hours 0.008063 1.008096 0.002791 2.889 0.00386 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatmenttournament 1.064 0.9394 0.7165 1.582
## treatmentreserve 1.112 0.8994 0.7498 1.649
## hours 1.008 0.9920 1.0026 1.014
##
## Rsquare= 0.007 (max possible= 0.577 )
## Likelihood ratio test= 7.97 on 3 df, p=0.04671
## Wald test = 8.56 on 3 df, p=0.03582
## Score (logrank) test = 8.94 on 3 df, p=0.03012
clog4 <- clogit(submit ~ treatment + I(mm_rating/100) + hours +
strata(date), data = scores.panel)## Error in unique(c("AsIs", oldClass(x))): object 'mm_rating' not found
summary(clog4)## Error in summary(clog4): object 'clog4' not found
# Compare models
models <- list(clog, clog2, clog3, clog4, m.log)## Error in eval(expr, envir, enclos): object 'clog' not found
stargazer(models, type = "text")## Error in .summary.object$r.squared: $ operator is invalid for atomic vectors
We transform data in a panel with the date of the first submission. The probability of entry in a given date is modeled with a conditional logit. We find that the probability of entry in a given date decreases over time. (if it was at random 1/8 chances of having a submission in a given date). The estimated drop is about 13 percent from one date to the next (a reduction of above 100% from the first to the last date). Using interactions with treatment dummies, we find that the decrease in the probability of entry is negative in all treatments but it is about 25 percent in the race (with pvalue<0.05) it is only 8 (pval) and 6 (pval) percent for tournaments and reserve, respectively.
# Create variable
dmin <- function(x) x - min(x)
scores$date <- scores$timestamp %>% as.Date %>% as.numeric %>%
dmin
create.panel <- function(data) {
data$submit_first <- 1
g <- with(data, expand.grid(date = seq(min(date), max(date)),
coder_id = unique(coder_id), submit_first = 0))
g2 <- rbind(g, data[, c("date", "coder_id", "submit_first")])
aggregate(submit_first ~ date + coder_id, g2, FUN = sum)
}
scores.panel <- create.panel(subset(scores, submission == 1))
z <- merge(scores.panel, races, by = "coder_id")
# Unconditional semi-parametric [Odds(first) = exp(alpha +
# beta * t )]
fit <- rep()
fit$clog <- clogit(submit_first ~ date, data = z)
summary(fit$clog)## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date, data = z,
## method = "exact")
##
## n= 774, number of events= 86
##
## coef exp(coef) se(coef) z Pr(>|z|)
## date -0.1510 0.8598 0.0461 -3.275 0.00106 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## date 0.8598 1.163 0.7855 0.9412
##
## Rsquare= 0.014 (max possible= 0.498 )
## Likelihood ratio test= 11.17 on 1 df, p=0.0008303
## Wald test = 10.73 on 1 df, p=0.001055
## Score (logrank) test = 11.02 on 1 df, p=0.0008998
# Unconditional parametric [Odds(first) = exp(alpha + beta *
# t )]
fit$logi <- glm(submit_first ~ date, data = z, binomial(logit))
summary(fit$logi)##
## Call:
## glm(formula = submit_first ~ date, family = binomial(logit),
## data = z)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6252 -0.5441 -0.4394 -0.3801 2.3687
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.53318 0.18853 -8.132 4.21e-16 ***
## date -0.15121 0.04614 -3.277 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 539.99 on 773 degrees of freedom
## Residual deviance: 528.81 on 772 degrees of freedom
## AIC: 532.81
##
## Number of Fisher Scoring iterations: 5
# Unconditional with interaction
fit$logi2 <- glm(submit_first ~ date + date:treatment, data = z,
binomial(logit))
summary(fit$logi2)##
## Call:
## glm(formula = submit_first ~ date + date:treatment, family = binomial(logit),
## data = z)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6267 -0.5273 -0.4601 -0.3801 2.5737
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.52808 0.18858 -8.103 5.36e-16 ***
## date -0.21835 0.07201 -3.032 0.00243 **
## date:treatmenttournament 0.08546 0.07472 1.144 0.25273
## date:treatmentreserve 0.09344 0.07689 1.215 0.22423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 539.99 on 773 degrees of freedom
## Residual deviance: 526.96 on 770 degrees of freedom
## AIC: 534.96
##
## Number of Fisher Scoring iterations: 5
# Note the difference when the intercept is not there
# Conditional [Odds(first) = exp(alpha_i + beta * t )]
fit$coclog <- clogit(submit_first ~ date + strata(coder_id),
data = z)
summary(fit$coclog)## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date + strata(coder_id),
## data = z, method = "exact")
##
## n= 774, number of events= 86
##
## coef exp(coef) se(coef) z Pr(>|z|)
## date -0.1340 0.8746 0.0433 -3.095 0.00197 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## date 0.8746 1.143 0.8034 0.9521
##
## Rsquare= 0.013 (max possible= 0.386 )
## Likelihood ratio test= 9.93 on 1 df, p=0.001627
## Wald test = 9.58 on 1 df, p=0.00197
## Score (logrank) test = 9.81 on 1 df, p=0.001735
# Interactions with treatment dummies [Odds(first) =
# exp(alpha_i + beta_i * t )]
fit$coclog2 <- clogit(submit_first ~ date:treatment + strata(coder_id),
data = z)
summary(fit$coclog2)## Call:
## coxph(formula = Surv(rep(1, 774L), submit_first) ~ date:treatment +
## strata(coder_id), data = z, method = "exact")
##
## n= 774, number of events= 86
##
## coef exp(coef) se(coef) z Pr(>|z|)
## date:treatmentrace -0.27991 0.75585 0.08801 -3.180 0.00147 **
## date:treatmenttournament -0.08726 0.91644 0.06847 -1.274 0.20253
## date:treatmentreserve -0.06708 0.93512 0.07522 -0.892 0.37256
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## date:treatmentrace 0.7559 1.323 0.6361 0.8982
## date:treatmenttournament 0.9164 1.091 0.8013 1.0481
## date:treatmentreserve 0.9351 1.069 0.8069 1.0837
##
## Rsquare= 0.018 (max possible= 0.386 )
## Likelihood ratio test= 14.16 on 3 df, p=0.002689
## Wald test = 12.53 on 3 df, p=0.005761
## Score (logrank) test = 13.61 on 3 df, p=0.003487
# Note: the above model is the same as clogit(submit_first ~
# date + date:treatment + strata(coder_id), data=z)
# Coefficients
sapply(fit, coef)## $clog
## date
## -0.1510078
##
## $logi
## (Intercept) date
## -1.5331826 -0.1512076
##
## $logi2
## (Intercept) date date:treatmenttournament
## -1.52808481 -0.21835013 0.08545552
## date:treatmentreserve
## 0.09344282
##
## $coclog
## date
## -0.1339918
##
## $coclog2
## date:treatmentrace date:treatmenttournament date:treatmentreserve
## -0.27990753 -0.08725850 -0.06707555
# Days as factors
# # Compute first submission time time.max <- '2015-03-16
# 11:59:59 EDT' time.min <- '2015-03-08 12:00:00 EDT'
# time.censored <- 192 fsutime <-
# as.numeric(difftime(races.sub$timestamp, time.min ,
# unit='hours')) dat <- with(races.sub, aggregate(fsutime ~
# handle, FUN=min)) dat$fsu <- 1 dat2 <- merge(dat, races,
# by='handle', all=TRUE) dat2$fsu[is.na(dat2$fsu)] <- 0
# dat2$fsutime[is.na(dat2$fsutime)] <- time.censored surv <-
# rep() surv$m1 <- survreg(Surv(fsutime, fsu) ~ 1, dat2,
# dist='weibull', scale=1) surv$m2 <- survreg(Surv(fsutime,
# fsu) ~ treatment + mmevents, dat2 , dist='weibull',
# scale=1) surv$m3 <- survreg(Surv(fsutime, fsu) ~ treatment
# + mmevents, dat2, dist='exponential') surv$tobin <-
# survreg(Surv(fsutime, fsu, type='left') ~ treatment +
# mmevents, data=dat2, dist='gaussian') stargazer(surv,
# type='text') # A model with different baseline survival
# shapes for two groups, i.e., # two different scale
# parameters survreg(Surv(time, status) ~ ph.ecog + age +
# strata(sex), lung) # There are multiple ways to
# parameterize a Weibull distribution. The survreg # function
# embeds it in a general location-scale family, which is a #
# different parameterization than the rweibull function, and
# often leads # to confusion. # survreg's scale =
# 1/(rweibull shape) # survreg's intercept = log(rweibull
# scale) # For the log-likelihood all parameterizations lead
# to the same value. y <- rweibull(1000, shape=2, scale=5)
# survreg(Surv(y)~1, dist='weibull') # Economists fit a model
# called `tobit regression', which is a standard # linear
# regression with Gaussian errors, and left censored data.
# tobinfit <- survreg(Surv(durable, durable>0, type='left') ~
# age + quant, data=tobin, dist='gaussian')In binomial linear models, the probability function \(p(x, \theta)\) is related to a linear prediction through a one-to-one transformation called the link function (see XXXX). [Let see if our model is linear]
Competitors have a latent ability variable \(a^*\) drawn from a common distribution \(F_l(\cdot)\) (distributions may differ across rooms).
At equilibrium, \(Y_{l} = 1 \iff a^* \geq a_{0l}\) where \(a_{0l}\) is the marginal type for the room in room \(l\).
Zero profit condition (revenues = costs) is
\[ R(a; F_l) = C(a, y_{0,l}, t_{0,l}) \]
First, we rewrite the probability \(p(x, \theta)\) in the following way:
\[ p(x_l, \theta) = 1 - \theta_1 x_l^{\theta_2} \]
where we denote the (observed) target by \(x_l\) and the vector of parameters by \(\theta=(\theta_1, \theta_2)\) where \(\theta_1 \equiv m_l^{\alpha/(n_l-1)}\) and \(\theta_2 \equiv \beta/(n_l-1)\). Assume that \(n_l=n\) for each \(l=1,...,L\).
Let \(Y_1, Y_2, ..., Y_L\) be the count of participants generated by our binomial-distribution model. The likelihood is:
\[ \mathcal{L} = \prod_{l=1}^L \binom{n_l}{y_l} p(x_l, \theta)^{y_l} [1-p(x_l, \theta)]^{n_l - y_l} \]
The log-likelihood is:
\[ ll = \sum_{l=1}^L y_l\log(p(x_l, \theta)) + (n_l - y_l) \log(1-p(x_l, \theta)) \]
where we omit the binomial coefficient as it is a constant that does not affect estimation.
Using our model into the aboev eqution:
\[ ll = \sum_{l=1}^L y_l\log(1 - \theta_1 x_l^{\theta_2}) + (n - y_l) [\log(\theta_1) + \theta_2\log(x_l)] \]
Let the summation be denoted by \(\sum y_l = \bar y\). We can rewrite the ll expression:
\[ ll = (L n - \bar y)\log(\theta_1) + \sum_{l=1}^L y_l\log(1 - \theta_1 x_l^{\theta_2}) + (n - y_l) \theta_2\log(x_l) \]
First order conditions are:
\[ \frac{\partial ll}{\partial \theta_1} = \frac{(L n - \bar y)}{\theta_1} - \sum_{l=1}^L y_l \frac{x_l^{\theta_2}}{1 - \theta_1 x_l^{\theta_2}} = 0 \]
\[ \frac{\partial ll}{\partial \theta_2} = \sum_{l=1}^L \left[- y_l \frac{x_l^{\theta_2}\theta_1 \log(x_l)}{1 - \theta_1 x_l^{\theta_2}} + (n - y_l) \log(x_l) \right]= 0. \]
Try with simulations
n <- 10
nsize <- 199
theta1 <- 0.75
theta2 <- 0.05
params <- c(theta1, theta2)
x <- runif(nsize)
p <- 1 - theta1 * x^theta2
# hist(prob, 100)
loglik <- function(theta, data) {
theta1 <- theta[1]
theta2 <- theta[2]
x <- data$x
y <- data$y
n <- data$n
p <- 1 - theta1 * x^theta2
out <- dbinom(y, n, p, log = T)
return(-sum(out))
}
# Simulate data
y <- rbinom(nsize, size = n, prob = p)
# Example: loglik(c(0.5, 0.9), data.frame(y, x, n))
# MLE estimation
dat <- data.frame(y, x, n)
theta.start <- runif(2)
mle.est <- optim(theta.start, loglik, data = dat)
thetahat <- mle.est$par
# Replications
out <- replicate(1000, {
y <- rbinom(nsize, size = n, prob = p)
dat <- data.frame(y, x, n)
mle.est <- optim(theta.start, loglik, data = dat)$par
})
boxplot(t(out))
abline(h = params, col = 2)truth <- function(x) 1 - params[1] * x^params[2]
# Now I can 'predictions' on what would happen with different
# 'x' distribution
prediction <- function(x) 1 - thetahat[1] * x^thetahat[2]
# Compare with logistic regression
N <- rep(n, nsize)
logistic <- glm(cbind(y, N) ~ x, family = binomial(logit))
prediction.logistic <- function(x) {
yhat <- coef(logistic)[1] + coef(logistic)[2] * x
exp(yhat)/(1 + exp(yhat))
}
# FIGURE
curve(truth, xlab = "Room covariate", ylim = c(0, 1))
curve(prediction, add = TRUE, lty = 2)
curve(prediction.logistic(x), add = TRUE, lty = 3)
legend("topright", c("Truth", "Estimated", "Logistic"), lty = 1:3)Imagine that the distribution \(F_\theta\) is uniform on \((0, \theta)\) and the cost ability \(c_a(x) = 1/x\). Then the zero profit condition becomes
\[\begin{align} \left(\frac{m}{\theta}\right)^{n-1} m = \beta x_l \end{align}\]Solving for the marginal type gives:
\[\begin{align} m = \left(\beta x_l \theta^{n-1}\right)^{1/n} \end{align}\]So, the probability of participation becomes
\[\begin{align} p(x_l, \theta) & = 1 - F_\theta(m) \\ & = 1 - \frac{\left(\beta x_l \theta^{n-1}\right)^{1/n} }{\theta} \\ & = 1 - \left(\frac{\beta x_l}{\theta}\right)^{1/n}. \end{align}\]Because it is non-linear in its parameters, the model is not identifiable [need proof]. However, we can re-parametrize the model with \(\beta/\theta \equiv \eta\) to make the parameter \(\eta\) identifiable.
Simulated distribution of participants with uniform distribution. Parameters are \(\eta=0.1\) and \(n=15\). Data simulated 500 times. Dashed curve had higher costs (\(x_l=1.5\)) than the solid curve (\(x_l=0.5\)).
In this example, we assume abilities are drawn from a lognormal distribution with parameters \(\mu\) and \(\sigma\).
Our aim is to estimate the parameters \(\theta\) and \(\beta\) and to evaluate whether costs are different between the three competition modes under study: race, tournament, tournament + requirement.
It is natural to estimate the parameters \(\theta\) by maximum likelihood because the ability distribution (and therefore the distribution of the outcomes) is known. The estimation criterion used here is the maximization of the deviance.
The deviance is: \[\begin{equation} D(\theta) = -2 \sum Y \log (\frac{N p(x, \theta)}{Y}) + (N - Y) \log ( \frac{N - N p}{N - Y}). \end{equation}\]Note that the deviance is a function of the likelihood (see xxx pg. xxx).
# Define likelihood
structreg <- function(x, y, wt = rep(1, length(y)), intercept = TRUE,
start = rep(0, p), ...) {
fmin <- function(beta, X, y, w) {
p <- 1 - (beta %*% X)^(1/n) # Function of parameters
-sum(2 * w * ifelse(y, log(p), log(1 - p)))
}
# gmin <- function(beta, X, y, w) { # Gradient here! }
if (is.null(dim(x)))
dim(x) <- c(length(x), 1)
dn <- dimnames(x)[[2]]
if (!length(dn))
dn <- paste("Var", 1:ncol(x), sep = "")
p <- ncol(x) + intercept
if (intercept) {
x <- cbind(1, x)
dn <- c("(Intercept)", dn)
}
if (is.factor(y))
y <- (unclass(y) != 1)
# fit <- optim(start, fmin, gmin, X = x, y = y, w = wt,
# method = 'BFGS', ...)
fit <- optim(0.5, fmin, X = x, y = y, w = wt, method = "BFGS",
...)
names(fit$par) <- dn
cat("\nCoefficients:\n")
print(fit$par)
cat("\nResidual Deviance:", format(fit$value), "\n")
cat("\nConvergence message:", fit$convergence, "\n")
invisible(fit)
}
# Create data
ilogit <- function(x) exp(x)/(1 + exp(x))
n <- 50000
competitors <- 5
x1 <- rchisq(competitors, df = 3)
x2 <- runif(n)
x3 <- rnorm(n)
X <- cbind(rep(1, length(x1)), x1, x2, x3)
betas <- c(-2.5, 0.1, 0.25, 0.5)
fc <- ilogit(X %*% betas) ## This the fixed cost
p <- 1 - fc^(1/competitors)
y <- rbinom(n = n, size = competitors, prob = p)
# Objective function
fmin <- function(beta, X, y, w, competitors) {
p <- 1 - ilogit(X %*% beta)^(1/competitors) # Function of parameters
-sum(2 * w * ifelse(y, log(p), log(1 - p)))
}
b.start <- runif(length(betas))
(out <- optim(b.start, fmin, X = X, y = y, competitors = competitors,
w = rep(1, length(y)))$par)## [1] -12.8788774 0.4197166 0.9890749 2.1442844
rbind(out/competitors, betas)## [,1] [,2] [,3] [,4]
## -2.575775 0.08394333 0.197815 0.4288569
## betas -2.500000 0.10000000 0.250000 0.5000000
Suppose F is lognormal(\(\mu, \sigma\)), then the zero profit condition is:
\[\begin{align} \left[\frac 12 +\frac 12 \left(\frac{\log{m}-\mu}{\sqrt{2\sigma}}\right)\right] = a^{-1/(n-1)} K_l \end{align}\]# xxx
zeroprofit <- function(x, mu, sigma, n) {
x * plnorm(x, meanlog=mu, sdlog=sigma)^(n-1)
}
for (i in 3:10)
curve(zeroprofit(x, i, 10, 10), add=TRUE, lty=i)
# Clear
rm(list = ls())
# Libraries
require(xtable)
require(moments)
require(races)
# Data
data(races)
attach(races)
# TABLE 1.
size <- ave(submit, paste(treatment, room), FUN = length)
(m <- aggregate(submit ~ room + size + treatment, FUN = sum))## room size treatment submit
## 1 1 9 race 2
## 2 2 10 race 2
## 3 3 10 race 5
## 4 4 10 race 1
## 5 5 15 race 4
## 6 6 15 race 3
## 7 7 15 race 4
## 8 8 15 race 5
## 9 1 10 tournament 3
## 10 2 10 tournament 5
## 11 3 10 tournament 5
## 12 4 10 tournament 2
## 13 5 15 tournament 5
## 14 6 15 tournament 4
## 15 7 15 tournament 4
## 16 8 15 tournament 5
## 17 1 10 reserve 1
## 18 2 10 reserve 3
## 19 3 10 reserve 3
## 20 4 10 reserve 2
## 21 5 15 reserve 4
## 22 6 15 reserve 6
## 23 7 15 reserve 3
## 24 8 15 reserve 5
We now generalize the contest game introduced by @moldovanu2001optimal to a situation where players simultaneously decide \(i)\) the quality and \(ii)\) how fast to produce a given output. Then we explore the problem of revenue maximization faced by a contest designer with preferences for both quality and time.
In this section, we solve the model for the unique symmetric Bayesian Nash equilibrium of players.
At equilibrium, players choose \(\quality\) and \(t_i\) by maximizing given their beliefs about the equilibrium actions of the other players.
The key to characterization of the equilibrium is that \(t_i=\deadline\) is a (weakly) dominant strategy for each player. Indeed, any time strictly below the deadline does not affect the probability of winning but is costly in terms of effort, and any time strictly above the deadline gives a negative payoff.
Since players should avoid playing weakly dominated strategies, we have that \(t_i=\deadline\) for every \(i=1,...,n\). Then it can be shown that, at equilibrium, the optimal quality is a one-to-one transformation of a player’s ability according to some “bidding” function \(b(\cdot)\). Hence, \(Y_i = b(A_i)\). Since the distribution of \(A_i\) is known, one can use a change of variables formula to express the probability of winning in in terms of the distribution \(F\). Then, the first order condition of \(\pi\) with respect to quality gives the following differential equation in \(\ability\) (for every \(i=1,...,n\)):
\[ \sum_{k=1}^{q} \hat p^\prime_{k}(\phi(y_i)\mid \tournament) \phi^\prime(y_i) v_k = \beta y_i^{\beta-1} a^\alpha t^\gamma \]
with the boundary condition \(y(\lotype)=0\).
At equilibrium we have \(a=\phi(y)\). Thus, the differential equation is separable. After rearranging, we have:
\[ \sum_{k=1}^{q}\phi(y_i)^{-\alpha}\hat p^\prime_{k}(\phi(y_i)\mid \tournament)\phi^\prime(y_i) v_k = \beta y_i^{\beta-1} t^\gamma. \]
Integration of both sides with respect to \(\phi\) gives:
\[ \int_{0}^{y} \sum_{k=1}^{q} t^{-\gamma} v_k \phi(x)^{-\alpha}\hat p^\prime_{k}(\phi(x)\mid \tournament)\phi^\prime(x) dx = \int_{0}^{y} \beta x^{\beta-1} dx \]
Using the chain rule of derivatives
\[ \int_a^b f(g(x)) g^\prime(x) dx = \int_{g(a)}^{g(b)} f(x) dx \qquad \text{(chain rule)} \]
the solution is
\[\begin{equation} \label{optimal quality tournament} y(a_i) = \left[\deadline^{-\gamma} \sum_{k=1}^q v_k \int x^{-\alpha} \hat p^\prime_k(x)dx \right]^{1/\beta} \end{equation}\]An important property of is that \(y(a_i)\) has its lower bound in zero and its upper bound in
\[ y(\hitype) = \left[\deadline^{-\gamma} v_1 \left( \frac{\hitype^{-\alpha+1} - \lotype^{-\alpha+1}}{-\alpha + 1} \right)\right]^{1/\beta}. \]
Monotonicity of the equilibrium output quality implies that, for every \(i=1, ..., n\), the equilibrium expected payoff from the contest \(\pi_i^*\) depends on the rank of the player’s ability relative to the others.
When abilities are distributed over the unit interval, the equilibrium payoff for the highest ability player is \(\pi(\hitype=1) = v_1 (-\alpha)/ (1-\alpha)\).
In a similar way, one can derive the equilibrium strategy in a race. Again the key observation is that any quality below the target gives a zero probability of winning and any quality above the target gives a constant probability of winning. Thus, player \(i\)’s choice of quality is either zero (with \(t_i=\deadline\) by convention) or \(y_i=\target\).
Then, the equilibrium xxx for player \(i\) is \[\begin{equation} \label{tstar} t^*(a_i) = \ctime^{-1} \left[\ctime(\deadline) + \frac{1}{\cscore(\target)} \left(\alpha \int_{a_i}^{\hitype} A^\prime(z) dz + (1-\alpha) \int_{a_i}^{\hitype} B^\prime(z) dz \right) \right] \end{equation}\] where \[\begin{equation} A(x) = \frac{1}{c_{a}(x)} f_{(n-1:n-1)}(x) \end{equation}\] and \[\begin{equation} B(x) = \frac{1}{c_{a}(x)} \left\{ \left[1- F_{(n-1:n-1)}(x)\right]f_{(n-1:n-2)}(x) + f_{(n-1:n-1)}(x) F_{(n-1:n-2)}(x) \right\}. \end{equation}\] An important property of XX is that \(y^*(a_i)\) has its upper bound in XX and lower bound in XX. Again payoffs are xxxx. Hence, by setting to zero and solving for the ability, gives the marginal ability \({\underline a}\) as \[\begin{equation} {\underline a}= h(n, V, F_A, C, d). \end{equation}\]By comparing equilibrium xxx and xxx, we find that the race and the tournament do not (ex-post) dominate one another with respect to output quality. Whereas the race always dominates the tournament with respect to completion time. [This is only when the deadline is the same. Otherwise, there’s always xxxx.] This result is stated below.
Let’s make an example.
p <- plnorm # pdf individual abilities
r <- rlnorm # Simulate individual abilities
cy <- function(x) x^2 # Cost function performance
ct <- function(x) 2*exp(1-x) # Cost function timing
FIGURE 1. Equilibrium bids in a race and a tournament.
Implications. The above proposition applies only if the target is higher in a race than in a tournament. But what if the two competitions had the same target ? In that case, tournaments and races have the same marginal type. Therefore, the performance of players in the tournament with reserve are always non-lower than those in the race. This does not imply that it is optimal to set the target. On the contrary, we will show that it is optimal to set an optimal target in a tournament that is below the optimal target in a race. Next section.
Let us now focus on the contest designer’s problem. Imagine the contest designer can choose the competition format to be either the race or the tournament. Imagine all other aspects of design are given. The prize structure \(\alpha\) has been already chosen. There is a deadline \(\deadline\), which is the same in both competition formats. [The quality requirement \(\target^c\) in the tournament is smalle than that in the race \(\target^\race > \target^\tournament\))] We will relax this assumption later to consider a more general setting where these variables are also part of the contest designer’s problem.
The contest designer has an objective function that is increasing in the expected quality of the winning solution and decreasing in the corresponding time to completion. Here, to do not complicate exposition, we assume that the contest designer cares about the winning submission only: second placed efforts are not considered. [If the principal values the diversity of the solutions … but we assume it does not.]
XXX EQUATION XXXX
The optimal choice involves a comparison of the expected profits between the race and the tournament. Given xxxx, we can show that there will be a threshold on the cost of completion time \(\hat\tau\) above which the race is a better choice than the tournament, and vice versa.
Proof. In a tournament, the objective function is \[\begin{align} R_\tournament & = \Pr(t_{(1:n)}\leq \deadline) \left\{\int y^*(x \mid t_{(1:n)}\leq \deadline) dF_{n:n}(x) - \tau \deadline - 1 \right\} \nonumber\\ & = \int_{\mtype}^{\hitype} y^*(x) dF_{n:n}(x) - \tau \deadline - 1. \end{align}\]That is, the contest designer’s objective function is the sum of the expected output quality for a given deadline, minus the cost \(\tau\) of having the winner working on the task until completion (i.e., until the deadline), and the cost of the prize pool (recall the prize pool is normalized to one).
[Implicitly, you’re assuming that the prize is always large enough to ensure positive effort.] [Second prize too is stochastic!!!!]
In a race, the objective function is \[\begin{align} R_\race & = \Pr(a_{(N)}\geq \mtype) \left\{\target - \alpha - \Pr(a_{(N-1)}\geq \mtype) (1-\alpha) \right\} - \tau \int_{\mtype}^{\infty} t^*(x) dF_{N:N}(x) \nonumber\\ & = [1-F_{N:N}(\mtype)] \left\{\target - \alpha - [1-F_{N-1:N}(\mtype)] (1 - \alpha) \right\} - \tau \int_{\mtype}^{\infty} t^*(x) dF_{N:N}(x). \end{align}\] Note. \(t^*(x) \leq \deadline\) for all \(x\)’s. Thus, a lower bound for the above objective function can be computed: \[\begin{align} \underline {R_\race} & = [1-F_{N:N}(\mtype)] \left\{\target - \alpha - [1-F_{N-1:N}(\mtype)] (1 - \alpha) - \tau \deadline\right\} \end{align}\]An even simpler lower bound is rewriting the above expression as if \(\alpha=1\) (note if the real alpha was set 1 then also mtype would change and therefore setting alpha hits a lower bound only when mtype does xxxx when alpha is 1).
Note. \(y^*(x)\) is lower than \(\target\) for all \(a < \mtype\). Thus, a lower bound of the tournament’s expression is \[\begin{align} \overline {R_\tournament} & = [1-F_{N:N}(\mtype)] \target + \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - \tau \deadline - 1. \end{align}\] \[\begin{align} \underline {R_\race} \geq & \overline {R_\tournament} \nonumber\\ [1-F_{N:N}(\mtype)] (\target - 1 - \tau \deadline) \geq & [1-F_{N:N}(\mtype)] \target + \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - \tau \deadline - 1 \nonumber\\ - [1-F_{N:N}(\mtype)] (\tau\deadline + 1) \geq & \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) - (\tau \deadline + 1) \nonumber\\ F_{N:N}(\mtype) (\tau \deadline + 1) \geq & \int_{\mtype}^\infty y^*(x) dF_{N:N}(x) \nonumber\\ \tau \geq & \left[ \frac{\int_{\mtype}^\infty y^*(x) dF_{N:N}(x)}{F_{N:N}(\mtype)} -1 \right] \frac{1}{\deadline} \end{align}\]End proof.
When the cost of time \(\tau\) is sufficiently high, the race is preferred. Interestingly, the threshold is a function of the deadline to complete the job, as xxx. It also depends on the shape of xxxx.
Now we turn to discuss the contest designer’s choice of an optimal minimum requirement \(\target\). So far, we have assumed that \(\target^\race>\target^\tournament\). Now, we show that the assumption that xxxx is indeed an optimal choice of the contest designer. This is summarized in the next proposition.
To prove that it is indeed the case. We proceed in two steps. First, we assume that the contest designer does not care about minimizing the timing of the innovation by imposing \(\tau = 0\). For simplicity, assume that \(\alpha=1\) (winner-takes-all). In a race, this means that the optimal target will be a value that makes equal the costs in terms of less participation versus the gains in terms of higher values of the winning solutions. Formally, the contest designer’s problem in a race is \[\begin{align} \text{maximize } & R^\race = [1-F_{N:N}(\mtype)] (\target^\race - 1). \end{align}\] Note that \(\mtype\) depends on the target. This is clearly concave in \(\target^\race\). Thus, the first order condition is also sufficient. \[\begin{align}\label{foc race} \text{FOC } & \Rightarrow -F^\prime_{N:N}(\mtype) \mtype^\prime (\target^\race - 1) + [1-F_{N:N}(\mtype)] = 0. \end{align}\] In a tournament, … \[\begin{align} \text{maximize } & R^\race = \int_{\mtype}^\infty y^*(x, \target) d F_{N:N}(x) - [1-F_{N:N}(\mtype)]. \end{align}\]Convexity is not sure. If not, then the optimal target is zero. Which is lower than the optimal target in a race.
Instead. If the objective function is (strictly) concave then there’s an internal solution.
\[\begin{align} \label{foc tournament} \text{FOC } \Rightarrow & \frac{d\int_{\mtype}^\infty y^*(x, \target) d F_{N:N}(x)) }{d \target} + F^\prime_{N:N}(\mtype) \mtype^\prime =0 \nonumber\\ & \text{(by using Leibniz rule)}\nonumber\\ \Rightarrow & - y^*(\mtype, \target) \mtype^\prime F^\prime_{N:N}(\mtype) + \int_{\mtype}^\infty \dystar - F^\prime_{N:N}(\mtype) \mtype^\prime = 0\nonumber\\ \Rightarrow & -\target \mtype^\prime F^\prime_{N:N}(\mtype) + \int_{\mtype}^\infty \dystar - F^\prime_{N:N}(\mtype) \mtype^\prime = 0. \end{align}\] Using with , the optimal target is the same in the race and the tournament only if \[\begin{align} \int_{\mtype}^\infty \dystar = [1- F_{N:N}(\mtype)]. \end{align}\]\[ \frac{\partial y^*(x, \target)}{\partial \target} = \frac{c_y^\prime(\target)}{c_y^\prime(y^*(x, \target))}. \]
Then.
If \(c_y(\cdot)\) is linear, we have that the ratio is one for all \(x\).
If \(c_y(\cdot)\) is convex, then we have that it is less than one. If
If \(c_y(\cdot)\) is concave, then we have that it is higher than one.
As a result, if linear or convex the first order condition is lower than that in the race. Since the obj. function is concave (second order is decreasing), the target should be lower in a tournament than in a race to satisfy the first order condition. (a lower target increases the focs.).
Conjecture. If \(\tau>0\), the \(\target\) in the race is higher.
Readings:
The winner’s curse, reserve prices, and endogenous entry: Empirical insights from eBay auctions
Entry and competition effects in first-price auctions: theory and evidence from procurement auctions
General two-step strategy:
First step. Identify the marginal type from the data and the distribution of types.
Second step. Using the estimated distribution of types.
with \(y^*(\cdot)\) being an invertible function with \(\phi\) denoting the inverse.
Hence the distribution of bids is \[\begin{equation} F_{Y}(y) = \Pr(y_i^* \leq y) = \Pr(y^*(a_i) \leq y) = \Pr(a_i \leq \phi(y)) = F_\mathcal{A}( \phi(y)). \end{equation}\]Identification of the model. suggest
data(scores) dat <- merge(scores, races[, c(“coder_id”, “treatment”)], by=‘coder_id’) attach(dat)
time_l <- split(timestamp, coder_id)
difftime2 <- function(x) { n <- length(x) init <- as.POSIXct(“2015-03-08 12:00:00 EDT”) y <- x - c(init, x[-n]) units(y) <- “hours” return(as.numeric(y)) }
interval <- unlist(tapply(timestamp, coder_id, FUN=difftime2)) boxplot(interval ~ treatment, outline=FALSE)
sapply(interval_l, mean) sapply(interval_l, median)
interval_l <- split(interval, treatment) kruskal.test(interval_l) # Significant!!!!
In the standard tobit model [@amemiya1985advanced], we have:
\[ y_i = \begin{cases} y_l & \text{if } y^*_i < y_l\\ y^*_i & \text{if } y^*_i \geq y_l\\ \end{cases} \]
Where \(y^* = x\beta + \epsilon\), assuming disturbance \(\epsilon \sim N(0, \sigma^2)\)
Let denote the standard normal distribution by \(\Phi(\cdot)\) and the density function by \(\phi(\cdot)\). The likelihood is:
\[ \mathcal{L} = \prod_{i}^n \left(\sigma^{-1}\phi\left(\frac{y_i - X_i\beta}{\sigma}\right)\right) ^{I(y_i)} \left(1 - \Phi\left(\frac{X_i\beta - y_l}{\sigma}\right)\right) ^{1-I(y_i)} \]
where \(I(\cdot)\) is an indicator function that is 1 when \(y_i > y_l\) and 0 otherwise.
require(AER)
# Example with left censoring
n <- 1000
yl <- 0.1
x1 <- runif(n)
x2 <- rchisq(n, 1)
betas <- c(0.25, -0.1)
simulate.tobit <- function(n, betas, yl, x1, x2) {
ystar <- rnorm(n, mean = cbind(x1, x2) %*% betas)
y <- ifelse(ystar < yl, yl, ystar)
return(y)
}
# Try
y <- simulate.tobit(n, betas, yl = 0.1, x1, x2)
summary(tobit(y ~ x1 + x2 - 1, left = yl))##
## Call:
## tobit(formula = y ~ x1 + x2 - 1, left = yl)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 1000 525 475 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## x1 0.26721 0.06823 3.916 8.99e-05 ***
## x2 -0.09012 0.02347 -3.840 0.000123 ***
## Log(scale) -0.06132 0.03513 -1.746 0.080872 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 0.9405
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 3
## Log-likelihood: -992 on 3 Df
## Wald-statistic: 21.03 on 1 Df, p-value: 4.5294e-06
# Simulations
coef.sim <- replicate(499, {
y <- simulate.tobit(n, betas, yl, x1, x2)
coef(tobit(y ~ x1 + x2 - 1, left = yl)) - betas
})
# Consistency looks good
boxplot(t(coef.sim), ylim = 0.5 * c(-1, 1))# Inference looks good
apply(coef.sim, 1, sd)## x1 x2
## 0.07140839 0.02619388